############### ###############
## cbo_regression1.R
## Project: CBO
## Author: Kamil Kouhen
## Date of creation: 01/05/2022
############### ###############

# Function to export regression results from several different models in one common format that is easy to plot (with ggplot) - mainly for treatment-effect graphs and quick estimations for preliminary blind analysis

#' Estimate a treatment effect and return it as a one-row summary table
#'
#' The model call is assembled as text from the arguments, fitted once, and
#' the coefficient of the treatment variable is extracted together with a
#' robust, clustered and/or bootstrapped variance estimate.
#'
#' All character arguments are pasted into code that is parsed and evaluated
#' inside this function. The data frame named by `df` is therefore looked up
#' from the function's own scope: it cannot share its name with one of the
#' arguments of this function (e.g. a data frame literally called "df").
#'
#' NOTE(review): `tibble`, `tidy`, `coeftest`, `vcovHC`/`vcovCL`/`vcovBS`,
#' `margins`, `%>%` and the dplyr verbs are called without a namespace prefix;
#' presumably tibble, broom, lmtest, sandwich, margins and dplyr are attached
#' by the calling script - confirm.
#'
#' @param df Name of the data frame, as a character string.
#' @param variable.description Free text copied to the `Description` column.
#' @param specification One of "OLS", "LOGIT", "Negative Binomials", "TOBIT".
#' @param outcomevar Name of the outcome column of `df`, as a string.
#' @param treatmentvar Treatment variable, as a string. The coefficient whose
#'   term is `treatmentvar` or `treatmentvar` followed by "1" is reported.
#' @param ind.vars Optional character vector of covariates.
#' @param stde Type passed to `vcovHC()` / `vcovCL()`. Not used when
#'   `bootstrap` is TRUE.
#' @param FE Optional character vector of fixed-effect terms, added to the
#'   formula like covariates.
#' @param bootstrap If TRUE (default) the variance comes from `vcovBS()` with
#'   250 replications.
#' @param clustered.sd Optional name of the column of `df` identifying the
#'   clusters.
#' @param weights Optional weights, as a string pasted after `weights = `.
#' @param tobit.lower,tobit.upper Censoring bounds for TOBIT, as strings. When
#'   NULL the `AER::tobit()` defaults (0 and Inf) are used.
#' @param standardized.outcome If TRUE the outcome is wrapped in `scale()`.
#' @param AME If TRUE and `specification` is "LOGIT", the average marginal
#'   effect replaces the raw coefficient. Ignored (with a warning) otherwise.
#' @param delogchange If TRUE, `exp(x) - 1` is applied to the estimate, its
#'   standard error and confidence bounds, and the control mean is
#'   exponentiated.
#' @return A tibble with columns outcome, Description, estimate, se, p.value,
#'   conf.low, conf.high, control.group.mean, nb.obs, model, specification;
#'   numeric columns are rounded to 2 decimals. When the outcome has zero or
#'   undefined variance, a single row of NAs with `specification` set to
#'   "Not compiled" is returned instead (that row also carries `Treatment` and
#'   `statistic`).
cbo_regression1 <- function(df = "", #defining ds to be used between ""
                            variable.description = "", #If necessary, small description
                            specification = "OLS", #choosing model to be used between "" -  available as of 25/03/2022: OLS, TOBIT, LOGIT, Negative Binomials
                            outcomevar = "", #Defining outcome variables
                            treatmentvar, #Defining treatment variable
                            ind.vars = NULL, #Covariates to be used
                            stde = "HC0", #Robust standard error type (default is HC0)
                            FE = NULL, #Insert fixed effect (e.g. c("random_region", "situation"))
                            bootstrap = TRUE,
                            clustered.sd = NULL, #If clustered standard error, use variable identifying cluster, in factor() if necessary
                            weights = NULL, #If weights used, use weight variable
                            tobit.lower = NULL, #Only for tobit models, enter the lower bound between ""
                            tobit.upper = NULL,
                            standardized.outcome = FALSE,
                            AME = FALSE, #Average marginal effect instead of raw estimates?
                            delogchange = FALSE){ #Delogging log changes if necessary

  ## Argument checks ##

  #Checking first that specification exists
  if (length(specification) != 1 || !(specification %in% c("OLS", "LOGIT", "Negative Binomials", "TOBIT"))){
    stop("The model specified does not exist, please check the source code to see what model is currently available (OLS, LOGIT etc.)")
  }

  #df and outcomevar are pasted into code that is parsed below; an empty value would otherwise only show up as an obscure parse error
  if (!is.character(df) || length(df) != 1 || is.na(df) || !nzchar(df)){
    stop("`df` must be the name of a data frame, given as a single non-empty character string")
  }
  if (!is.character(outcomevar) || length(outcomevar) != 1 || is.na(outcomevar) || !nzchar(outcomevar)){
    stop("`outcomevar` must be the name of a column of `df`, given as a single non-empty character string")
  }

  #Switches are reduced to a single TRUE/FALSE once, so that every condition below is a clean scalar
  as.flag <- function(x) isTRUE(as.logical(x))
  use.bootstrap <- as.flag(bootstrap)
  use.std <- as.flag(standardized.outcome)
  use.delog <- as.flag(delogchange)
  want.ame <- as.flag(AME)
  use.cluster <- !is.null(clustered.sd)

  ## Model to be used ##
  spec <- switch(specification, #Function name pasted in front of the model call
                 "OLS" = "lm",
                 "LOGIT" = "glm",
                 "Negative Binomials" = "MASS::glm.nb",
                 "TOBIT" = "AER::tobit")
  whatmodel <- if (specification == "Negative Binomials") "Neg. Binomials" else specification #For column "model" in final table

  addon <- "" #Extra arguments appended to the model call
  if (specification == "LOGIT"){
    addon <- ", family = 'binomial'" #Necessary addon to glm for logit
  }
  if (specification == "TOBIT"){
    #An unset bound used to produce "left =, right =" (an empty argument); fall back on the AER::tobit defaults instead
    lower.bound <- if (is.null(tobit.lower)) "0" else tobit.lower
    upper.bound <- if (is.null(tobit.upper)) "Inf" else tobit.upper
    addon <- paste0(", dist = 'gaussian', left =", lower.bound, ", right =", upper.bound)
  }

  ## Outcome must exist and vary, otherwise nothing can be estimated ##
  cbo.data <- eval(parse(text = df))
  cbo.outcome <- cbo.data[[outcomevar]]
  if (is.null(cbo.outcome)){
    stop(paste0("Outcome variable ", outcomevar, " was not found in ", df, "."))
  }

  ## There could be some cases where outcome variable is always equal to 1, logit therefore cannot compile
  outcome.sd <- sd(cbo.outcome, na.rm = TRUE)
  if (is.na(outcome.sd) || outcome.sd == 0){
    if (is.na(outcome.sd)){ #All values missing, or a single non-missing value
      print(paste0("Warning: Outcome variable ", outcomevar, " has an undefined variance. Estimation not possible."))
    } else{
      print(paste0("Warning: Outcome variable ", outcomevar, " has a zero variance. Estimation not possible."))
    }
    return(
      tibble( #Creating empty output
        outcome = outcomevar,
        Description = variable.description,
        se = NA,
        Treatment = treatmentvar,
        estimate = NA,
        statistic = NA,
        p.value = NA,
        conf.low = NA,
        conf.high = NA,
        nb.obs = NA,
        model = whatmodel, #Same label as in the regular output
        specification = "Not compiled",
        control.group.mean = NA
      )
    )
  }

  ## Optional parts of the model call (each one also leaves a tag in the "specification" column) ##

  #Covariates are given in a vector, need to put them in " + var1 + var2..." format
  #(joined with paste() so that a term containing ", ", e.g. "poly(x, 2)", is left intact)
  has.covariates <- length(ind.vars) > 0
  ind.varsvec <- if (has.covariates) paste0(" + ", paste(ind.vars, collapse = " + ")) else ""
  ind.varsyes <- if (has.covariates) "ind.vars " else ""

  #Fixed effects, same format
  has.fe <- length(FE) > 0
  FE.vec <- if (has.fe) paste0(" + ", paste(FE, collapse = " + ")) else ""
  FEyes <- if (has.fe) "FE " else ""

  #Weights
  has.weights <- !is.null(weights)
  wgts <- if (has.weights) paste0(", weights = ", weights) else ""
  wgtsyes <- if (has.weights) "wgts " else ""

  #Outcome, standardized if asked
  lhs <- if (use.std) paste0("scale(", outcomevar, ")") else outcomevar
  stdoutcome <- if (use.std) "std." else ""

  ## Creating estimations and output ##

  #na.action has to be passed by name: unnamed, it was matched to another argument of the model function (subset or weights)
  cmd1 <- paste0(spec, "(", lhs, "~", treatmentvar, ind.varsvec, FE.vec, " , data = ", df, ", na.action = na.omit", wgts, addon, ")")
  cbo.fit <- eval(parse(text = cmd1)) #Model is fitted once and reused below

  #Variance-covariance matrix: clustered or not, bootstrapped or not
  if (use.cluster){
    cbo.cluster <- eval(parse(text = paste0("factor(", df, "$", clustered.sd, ")")))
    cbo.vcov <- if (use.bootstrap){
      vcovBS(cbo.fit, cluster = cbo.cluster, R = 250)
    } else{
      vcovCL(cbo.fit, cluster = cbo.cluster, type = stde)
    }
  } else{
    cbo.vcov <- if (use.bootstrap){
      vcovBS(cbo.fit, R = 250)
    } else{
      vcovHC(cbo.fit, type = stde)
    }
  }
  clusteryes <- if (use.cluster) "clustered.se" else ""
  bootstrapyes <- if (use.bootstrap) "BS" else ""

  #Raw estimates (with confidence interval), only keeping coefficient of treatment variable
  #Two candidate names in case treatment variable is numeric and does not have "1" at the end of the regression output name
  wanted.terms <- c(paste0(treatmentvar, "1"), treatmentvar)
  regr <- tidy(coeftest(cbo.fit, vcov. = cbo.vcov), conf.int = TRUE) %>%
    filter(term %in% wanted.terms)

  #Marginal effect for treatment variable, if specified as argument, average marginal effect will replace coefficient (for logit)
  use.ame <- want.ame && specification == "LOGIT"
  if (want.ame && !use.ame){
    warning("AME is only available for the LOGIT specification; raw estimates are returned.", call. = FALSE)
  }
  AMEyes <- if (use.ame) "AME" else ""
  ame <- NULL
  if (use.ame){
    #vcov is given to margins() itself: passed to summary() it was silently ignored and the default model variance was used
    AME.margin <- as.list(summary(margins(cbo.fit,
                                          data = cbo.data,
                                          type = "response",
                                          vcov = cbo.vcov)) %>%
                            filter(factor %in% wanted.terms))
    #One value per statistic (NA if the treatment row was not found in the margins output)
    ame <- lapply(AME.margin[c("AME", "SE", "p", "lower", "upper")], rep, length.out = 1)
  }

  #Mean of the outcome in the control group (treatment variable equal to 0)
  cbo.control <- eval(parse(text = paste0("(", df, "%>% filter(", treatmentvar, "== 0))$", outcomevar)))
  if (use.delog){
    control.mean <- round(exp(mean(cbo.control, na.rm = TRUE)), digits = 2)
  } else if (use.std){
    #NOTE(review): the control group is rescaled on its own here, so this mean is always ~0 - confirm this is intended
    control.mean <- round(mean(scale(cbo.control), na.rm = TRUE), digits = 2)
  } else{
    control.mean <- round(mean(cbo.control, na.rm = TRUE), digits = 2)
  }

  #Returning results
  regr %>% #Raw estimates are already in tibble class
    rename("Treatment" = term, "se" = std.error) %>%
    mutate(outcome = outcomevar,
           Description = variable.description, #Variable description
           estimate = if (use.ame) ame$AME else estimate, #If Average marginal effects are needed instead of raw estimates
           estimate = if (use.delog) exp(estimate) - 1 else estimate, #If need de-logged estimates
           se = if (use.delog) exp(se) - 1 else se, #If need de-logged
           se = if (use.ame) ame$SE else se,
           p.value = if (use.ame) ame$p else p.value,
           conf.low = if (use.ame) ame$lower else conf.low,
           conf.low = if (use.delog) exp(conf.low) - 1 else conf.low, #If need de-logged
           conf.high = if (use.ame) ame$upper else conf.high,
           conf.high = if (use.delog) exp(conf.high) - 1 else conf.high, #If need de-logged
           nb.obs = nobs(cbo.fit),
           model = whatmodel,
           specification = paste0(FEyes, wgtsyes, ind.varsyes, clusteryes, stdoutcome, AMEyes, bootstrapyes),
           control.group.mean = control.mean) %>%
    select(outcome, Description, estimate, se, p.value, conf.low, conf.high, control.group.mean, nb.obs, model, specification) %>%
    mutate(across(where(is.numeric), function(x) round(x, 2))) #rounding to 2 decimals

} #End of cbo_regression1
